
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re as re
from textacy import preprocessing as pp
import numpy as np


def import_dataset_oems():
    # Import patenting auto firms
    OEMs = pd.read_csv(PATHS.marklines / 'OEMs.csv')
    OEMs1 = OEMs[['Level1_OrbisName', 'Level1_bvdid']].drop_duplicates().rename(columns={'Level1_OrbisName': 'bvdid_name', 'Level1_bvdid': 'bvdid'})
    OEMs2 = OEMs[['Level2_OrbisName', 'Level2_bvdid']].drop_duplicates().rename(columns={'Level2_OrbisName': 'bvdid_name', 'Level2_bvdid': 'bvdid'})
    OEMs = pd.concat([OEMs1, OEMs2])
    OEMs['bvdid_name'] = OEMs['bvdid_name'].str.strip()
    OEMs = OEMs.drop_duplicates()
    OEMs = OEMs[OEMs['bvdid'].notnull()]
    OEMs.loc[OEMs[OEMs['bvdid'] == 'CN9360062998'].index, 'bvdid_name_cl'] = 'dongfeng motor corp'
    # Clean names
    OEMs['bvdid_name_cl'] = OEMs['bvdid_name'].apply(f_clean)
    LOGGER.info('Number of Orbis names to match: {}'.format(OEMs['bvdid_name_cl'].nunique()))
    return OEMs


def import_factset():
    Factset = pd.read_csv(PATHS.factset / 'COMPANY_withISIN.csv', sep=',', usecols=['NAME', 'ID', 'HOME_REGION'])
    Factset = Factset.drop_duplicates().rename(columns={'NAME': 'fctname', 'ID': 'fctid', 'HOME_REGION': 'fct_home_region'})
    # Clean names
    Factset['fctname'] = Factset['fctname'].apply(f_clean)
    LOGGER.info('Number of factset names available: {}'.format(Factset['fctname'].nunique()))
    # 155 orbis names to match against 331624 factset names
    return Factset


def find_bestmatch(tf_idf_matrix, list_names1, list_names2, n):
    lenlist_names1 = len(list_names1)
    cos = cosine_similarity(tf_idf_matrix[n], tf_idf_matrix[lenlist_names1:], dense_output=True)
    # nlargest(X): keep X highest matches
    cos = pd.Series(cos[0]).nlargest(1).reset_index()
    cos['bvdid_name_cl'] = list_names1[n]
    cos['cos'] = cos[0]
    del cos[0]
    # cos = pd.Series(cos[0]).sort_values(ascending=False).iloc[:3].reset_index()
    cos['fctname'] = cos['index'].apply(lambda x: list_names2[x])
    del cos['index']
    # cos['fuzzy'] = cos.apply(lambda row: fuzz.token_sort_ratio(row['name'], row['orbis']), axis=1)
    return cos


def f_clean(x):
    if type(x) != str:
        return np.nan
    else:
        x = pp.normalize.hyphenated_words(x)
        x = pp.normalize.quotation_marks(x)
        x = pp.normalize.unicode(x)
        x = pp.remove.punctuation(x)
        x = x.lower()
        x = pp.normalize.whitespace(x)
        x = x.replace('joint stock company', 'jsc')
        x = x.replace('public joint stock', 'pjs')
        x = x.replace('limited', 'ltd')
        x = x.replace('aktiengesellschaft', 'ag')
        x = x.replace('aktiebolaget', 'ab')
        x = x.replace('aktiebolag', 'ab')
        x = x.replace('corporation', 'corp')
        x = x.replace(' n v', ' nv')
        x = x.replace(' s p a', ' spa')
        return x


def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD', r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


def matching_names(OEMs, Factset):
    LOGGER.info('Begin matching')
    list_names1 = OEMs['bvdid_name_cl'].drop_duplicates().tolist()
    list_names2 = Factset['fctname'].drop_duplicates().tolist()
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(list_names1 + list_names2)
    matches = []
    # n = 0
    for n in range(len(list_names1)):
        # print(n, '/', len(list_names1))
        df = find_bestmatch(tf_idf_matrix, list_names1, list_names2, n)
        matches.append(df)
    matches = pd.concat(matches)
    matches = matches.sort_values(by='cos', ascending=False)[['cos', 'bvdid_name_cl', 'fctname']]
    matches.to_excel(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/temp_matches_oems_fctset.xlsx', index=False)
    LOGGER.info('End of matching - output saved in temp_matches_oems_fctset.csv')
    return matches


def adding_all_relevant_fctset_ids_fromredirect(OEMs):
    # Factset ids can change over time. make sure, all relevant ids are included
    df_Factset_redirect = pd.read_csv(PATHS.factset / 'COMPANY_REDIRECT.csv', sep=',')
    df_Factset_redirect['REF_COMPANY_ID'] = df_Factset_redirect['REF_COMPANY_ID'].replace('None', np.nan)
    df_Factset_redirect['REF_COMPANY_ID'] = df_Factset_redirect['REF_COMPANY_ID'].astype(float)
    df_Factset_redirect['COMPANY_ID'] = df_Factset_redirect['COMPANY_ID'].astype(float)
    list_ids = OEMs[OEMs['fctid'].notnull()]['fctid'].tolist()
    mask1 = df_Factset_redirect['REF_COMPANY_ID'].notnull()
    mask2 = df_Factset_redirect['REASON'].isin(['DUPLICATE', 'ERROR'])
    df_Factset_redirect = df_Factset_redirect[mask1 & mask2 & df_Factset_redirect['COMPANY_ID'].isin(list_ids)]
    df_Factset_redirect = df_Factset_redirect[~df_Factset_redirect['REF_COMPANY_ID'].isin(list_ids)]
    df_Factset_redirect = df_Factset_redirect[['COMPANY_ID', 'REF_COMPANY_ID']].drop_duplicates().merge(OEMs[['fctname', 'fctid']].drop_duplicates(), right_on='fctid', left_on='COMPANY_ID', how='left')
    df_Factset_redirect = df_Factset_redirect[['REF_COMPANY_ID', 'fctname']].rename(columns={'REF_COMPANY_ID': 'fctid'})
    extrarows = OEMs[OEMs['fctid'].isin(df_Factset_redirect['fctid'])]
    extrarows = extrarows.drop(columns=['fctid'])
    extrarows = extrarows.merge(df_Factset_redirect, on='fctname', how='left')
    OEMs = pd.concat([OEMs, extrarows], sort=True)
    OEMs = OEMs.sort_values(by=['bvdid_name', 'bvdid'])
    return OEMs


def adding_all_relevant_fctset_ids_fromaliases(OEMs):
    df_Factset_alias = pd.read_csv(PATHS.factset / 'COMPANY_ALIAS.csv', sep=',')
    list_ids = OEMs[OEMs['fctid'].notnull()]['fctid'].tolist()
    df_Factset_alias = df_Factset_alias[df_Factset_alias['COMPANY_ID'].isin(list_ids)]
    df_Factset_alias = df_Factset_alias[~df_Factset_alias['ALIAS_ID'].isin(list_ids)]
    df_Factset_alias = df_Factset_alias[['COMPANY_ID', 'ALIAS_ID']].drop_duplicates().merge(OEMs[['fctname', 'fctid']].drop_duplicates(), right_on='fctid', left_on='COMPANY_ID', how='left')
    extrarows = OEMs[OEMs['fctid'].isin(df_Factset_alias['fctid'])]
    extrarows = extrarows.drop(columns=['fctid']).drop_duplicates()
    df_Factset_alias = df_Factset_alias[['ALIAS_ID', 'fctname']].rename(columns={'ALIAS_ID': 'fctid'})
    extrarows = extrarows.merge(df_Factset_alias, on='fctname', how='left')
    OEMs = pd.concat([OEMs, extrarows], sort=True)
    OEMs = OEMs.sort_values(by=['bvdid_name', 'bvdid'])
    return OEMs


def main():
    LOGGER.info('SCRIPT: a_adding_factset_ids.py')
    OEMs = import_dataset_oems()
    Factset = import_factset()
    matches = matching_names(OEMs, Factset)
    #################################################
    # MANUAL CHECK
    #################################################
    # Manually check which names got a good match
    # Step 1: Open /Data_outputted/B_FactsetVariables/temp_matches_oems_fctset.xlsx  - outputted by matching_names()
    # Step 2: The xls sheet shows the orbis name in one column and the factset name in another, and the cosine score in a third
    # All the rows with a cosine score of 1 mean the match was perfect
    # Scroll down to the rows where the cosine score is lower than 1
    # Decide up to which point the match is acceptable
    # In previous runs, this was at 0.82
    # The following should not be tagged as a good match: lotus cars ltd //	lotus ltd . that's why the limit is placed at 0.82
    LOGGER.info('Manual decision to keep matches with cos higher than 0.82 plus a few hand picked cases')
    mask1 = matches['cos'] > 0.82
    # There are a few good matches below the 0.82. we're rescuing them by hand.
    list_to_keep = ['scania cv ab', 'jsc avtovaz', 'faw jiefang automotive co ltd', 'maserati spa',
                    'haima automobile company ltd', 'ford otomotiv sanayi anonim sirketi',
                    'china national heavy duty truck group jinan commercial vehicle co ltd']
    mask2 = matches['bvdid_name_cl'].isin(list_to_keep)
    matches = matches[mask1 | mask2]
    del matches['cos']
    OEMs = OEMs.merge(matches, on='bvdid_name_cl', how='left')
    OEMs = OEMs.merge(Factset[['fctname', 'fctid', 'fct_home_region']].drop_duplicates(), on='fctname', how='left')
    OEMs = adding_all_relevant_fctset_ids_fromredirect(OEMs)
    OEMs = adding_all_relevant_fctset_ids_fromaliases(OEMs)
    OEMs.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/OEMs_factset.csv', index=False)
    LOGGER.info('Final dataset saved as OEMs_factset.csv')
    return

